Question 1 Discussed in Class, Skipped

Question 2 Kernel Regression

Part (A)

Show that the least squares estimator in the one-predictor case can be written in the linear smoother form \(\hat{f}(x^*)=\sum_{i=1}^{n} w_i(x_i,x^*)y_i\).
\[\begin{align} \hat{y} &=X(X^TX)^{-1}X^Ty= Hy\\ \hat{y}_i &= \text{the $i$th element of $\hat{y}$}\\ &= \text{the $i$th row of $H$ multiplied by the vector $y$}\\ &= \sum_j H_{ij}y_j \\ &= \sum_j w(x_j,x) y_j \end{align}\]

When \(w_i=\frac{1}{K}\), \(\hat{f}(x)=\sum_{i=1}^K\frac{1}{K}y_i\), the average of the \(K\) closest samples (\(K\) nearest neighbours); this averaging smooths across neighbours and lowers the impact of outliers.

Part (B)

Write an R module with a Gaussian kernel

# Housekeeping for this standalone script: clear the workspace and fix the
# RNG seed so the simulations below reproduce the printed output.
# NOTE(review): rm(list = ls()) wipes the caller's workspace -- acceptable
# only because this file is a self-contained homework script.
rm(list=ls())
set.seed(123)
# 'modules' supplies module(), used below to namespace the kernel smoother.
library(modules)

#write module KernalRegression (kr)

# Kernel-regression module (kr): Gaussian-kernel (Nadaraya-Watson) smoother.
kr <- module({
  import('stats')

  # Gaussian kernel evaluated at the scaled distance d/h.
  K <- function(d,h){
    dnorm(x = d / h, mean = 0, sd = 1)
  }

  # Normalized kernel weights of the sample points x around the point xi.
  # The 1/h factor cancels after normalization but is kept for clarity.
  weights <- function(x,xi,h){
    raw <- K(x - xi, h) / h
    raw / sum(raw)
  }

  # Smooth (x, y) and predict at each point of x_new; returns a column of
  # weighted averages, one per element of x_new.
  knsmooth <- function(x_new,x,y,h){
    w_mat <- sapply(x_new, function(pt) weights(x, pt, h))
    t(w_mat) %*% y
  }
})

Test it with simulations:

# Simulate a centered cubic signal with unit Gaussian noise.
x <- runif(100,-3,3)
eps <- rnorm(length(x),0,1)
y <- 2*x^3+3*x^2+4*x+eps
y <- y-mean(y)

#create x_new for prediction and plot
x_new= seq(-3,3,length.out = length(x))

#use the knsmooth function from kr module on simulated x and y and predict x_new
# NOTE(review): with h = 0.01 some grid points may have essentially zero total
# kernel weight (0/0 -> NaN) if no observation falls nearby -- confirm.
yks_0 <- kr$knsmooth(x_new,x,y,0.01)
yks_1 <- kr$knsmooth(x_new,x,y,0.1)
yks_2 <- kr$knsmooth(x_new,x,y,0.5)
yks_3 <- kr$knsmooth(x_new,x,y,1)

#plot the fitted curves: smaller h = wigglier fit, larger h = smoother fit
plot(x,y,pch=20,main='KernelSmoother for Cubic x-y with Gaussian Error')
lines(x_new,yks_0,type='l',col='blue',lwd=2)
lines(x_new,yks_1,type='l',col='red',lwd=2)
lines(x_new,yks_2,type='l',col='orange',lwd=4)
lines(x_new,yks_3,type='l',col='green',lwd=2)
legend('topleft',c('h=0.01','h=0.1','h=0.5','h=1'),lty=1,col=c('blue','red','orange','green'))

## Question 3 Cross Validation

Part (A)

#create training and test dataset (cubic signal with unit Gaussian noise)
x <- runif(100,0,1)
eps <- rnorm(length(x),0,1)
y <- 2*x^3+3*x^2+4*x+eps
train <- data.frame(x = x, y = y)

xtest <- runif(100,0,1)
epstest <- rnorm(length(xtest),0,1)
ytest <- 2*xtest^3+3*xtest^2+4*xtest+epstest
# FIX: as.data.frame(cbind(...), colnames = ...) silently ignores the
# colnames argument, leaving columns named "xtest"/"ytest"; downstream
# test$x / test$y only worked through $'s partial name matching.
# Name the columns x and y explicitly instead.
test <- data.frame(x = xtest, y = ytest)

# write a function cv for hold-out cross validation of the kernel smoother
# train, test: data frames whose FIRST column is the predictor and SECOND
#              column is the response (all call sites in this file use that
#              layout).
# h:           kernel bandwidth.
# output: the kernel regression prediction for each test point and the raw
#         prediction error for each test response.
# FIX: the original used train$x / test$y, which only worked through $'s
# partial name matching (e.g. "x" matching column "xtest" or "x1") and
# silently returns NULL when the match is missing or ambiguous; use
# positional column access instead.
cv <- function(train,test,h){
  x_tr <- train[[1]]
  y_tr <- train[[2]]
  x_te <- test[[1]]
  y_te <- test[[2]]
  yks_test <- kr$knsmooth(x_te, x_tr, y_tr, h)
  err <- y_te - yks_test
  return(list(yks_test=yks_test,err=err))
}

# try a bunch of different h for cross validation error
h <- seq(0.001,1,length.out = 100)
# err_h: one column of per-observation test errors per candidate bandwidth
err_h = sapply(h,function (t) cv(train,test,t)$err)

# mean squared test error for each h (comment fixed: this is MSE, not a
# sum of absolute errors)
plot(h,colMeans((err_h)^2),main='Cross Validation to choose h')

h[which.min(colMeans((err_h)^2))]
## [1] 0.08172727

Part (B)

#create noisy x1 (sd = 2 noise) and not so noisy x2 (sd = 0.1 noise)
x1 <- runif(500,-5,5)
eps1 <- rnorm(length(x1),0,2)
x2 <- runif(500,-5,5)
eps2 <- rnorm(500,0,0.1)

#create a wiggly function (sine) and a smooth function (quadratic) for x1 and x2
y1w <- sin(x1)+eps1
y1s <- 2*x1^2+eps1

y2w <- sin(x2)+eps2
y2s <- 2*x2^2+eps2

# 2x2 panel of the four scenarios
par(mfrow=c(2,2))
plot(x1,y1w,main='noisy wigly')
plot(x2,y2w,main='not noisy wigly')
plot(x1,y1s,main='noisy smooth')
plot(x2,y2s,main='not noisy smooth')

par(mfrow=c(1,1))

# create the training and test dataset for x1 and x2, wiggly and smooth
# FIX: as.data.frame(cbind(...), colnames = ...) silently ignores the
# colnames argument, leaving columns named "x1"/"y1w" etc.; downstream
# train1w$x / train1w$y only worked through $'s partial name matching.
# Name the columns x and y explicitly.
xy1w <- data.frame(x = x1, y = y1w)
train1w <- xy1w[1:400,] 
test1w <- xy1w[401:500,]

xy1s <- data.frame(x = x1, y = y1s)
train1s <- xy1s[1:400,] 
test1s <- xy1s[401:500,]

xy2w <- data.frame(x = x2, y = y2w)
train2w <- xy2w[1:400,] 
test2w <- xy2w[401:500,]

xy2s <- data.frame(x = x2, y = y2s)
train2s <- xy2s[1:400,] 
test2s <- xy2s[401:500,]

# create a bunch of candidate bandwidths h to choose from
h=seq(0.001,3,length.out = 100)

# For each of the four scenarios, pick the bandwidth minimizing the mean
# squared hold-out error (err_* has one column per candidate h).
# choose the best h for x1 with wiggly function by minimizing the cross validation error
err_1w = sapply(h,function (t) cv(train1w,test1w,t)$err)
hbest_1w=h[which.min(colMeans(err_1w^2))]
hbest_1w
## [1] 0.5462727
# choose the best h for x2 with wiggly function by minimizing the cross validation error
err_2w = sapply(h,function (t) cv(train2w,test2w,t)$err)
hbest_2w=h[which.min(colMeans(err_2w^2))]
hbest_2w
## [1] 0.1221717
# choose the best h for x1 with smooth function by minimizing the cross validation error
err_1s = sapply(h,function (t) cv(train1s,test1s,t)$err)
hbest_1s=h[which.min(colMeans(err_1s^2))]
hbest_1s
## [1] 0.2433434
# choose the best h for x2 with smooth function by minimizing the cross validation error
err_2s = sapply(h,function (t) cv(train2s,test2s,t)$err)
hbest_2s=h[which.min(colMeans(err_2s^2))]
hbest_2s
## [1] 0.03129293
#plot each smoother with its best bandwidth
# FIX: the panel titles previously hard-coded stale h values
# (1.06/0.24/0.12/0.03) that did not match the selected bandwidths -- and
# were even paired with the wrong panels; build the titles from the actual
# hbest_* values instead.
x_grid=seq(-5,5,length.out = 500)
par(mfrow=c(2,2))
plot(x1,y1w,main=paste0('noisy wigly,h=',round(hbest_1w,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train1w$x,train1w$y,hbest_1w),col='red',lwd=2)
plot(x2,y2w,main=paste0('not noisy wigly,h=',round(hbest_2w,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train2w$x,train2w$y,hbest_2w),col='red',lwd=2)
plot(x1,y1s,main=paste0('noisy smooth,h=',round(hbest_1s,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train1s$x,train1s$y,hbest_1s),col='red',lwd=2)
plot(x2,y2s,main=paste0('not noisy smooth,h=',round(hbest_2s,2)),pch=20)
lines(x_grid,kr$knsmooth(x_grid,train2s$x,train2s$y,hbest_2s),col='red',lwd=2)

par(mfrow=c(1,1))

Part (C) LOOCV

The drawbacks of cross validation (randomly dividing into two sets):
1. If the prediction method is expensive to train, cross-validation can be very slow since the training must be carried out repeatedly.
2. need to preserve the “total blinding” of the validation set from the training procedure, otherwise bias may result.
We have mentioned earlier that \(\hat{y}\), as a linear combination of \(y\), can be written with a linear smoothing matrix \(H\): \[\begin{align} \hat{y} &= Hy\\ \hat{y}_i &= \text{the $i$th element of $\hat{y}$}\\ &= \text{the $i$th row of $H$ multiplied by the vector $y$}\\ &= \sum_j H_{ij}y_j \end{align}\]

Now that \[\hat{y^{-i}}=argmin_{j\neq i} w_j(y_j-\hat{y_j^{-i}})^2\] Define \[ Z=\left\{\begin{array}{cc} y_j & j\neq i \\ \hat{y_j^{-i}} & j=i \end{array} \right. \] Then, \(\hat{y_j^{-i}}\) is also \[\hat{y^{-i}}=argmin_{j\neq i} w_j(z_j-\hat{y_j^{-i}})^2\] Thus \[\hat{y^{-i}}= \sum_j H_{ij}z_j\] Subtracting \(\hat{y^{-i}}\) from \(\hat{y_i}\) we have, \[\hat{y_i}-\hat{y^{-i}}=\sum_j H_{ij}(y_j-z_j)=H_{ii}(y_i-\hat{y_j^{-i}})\] The last step holds since \(Y\) and \(Z\) only have one term different at \(j=i\).

Collecting the terms and write the above equation, we have \[\hat{y^{-i}}=\hat{y_i}-H_{ii}y_i+H_{ii}\hat{y^{-i}}\]

i.e., \[(1-H_{ii})\hat{y^{-i}}=\hat{y_i}-H_{ii}y_i\]

Thus, \[\begin{align} LOOCV &=\sum_i (y_i-\hat{y^{-i}})^2 \\ &= \sum_i \left(y_i-\frac{1}{1-H_{ii}}(\hat{y_i}-H_{ii}y_i)\right)^2 \\ &= \sum_i \left(\frac{y_i-\hat{y_i}}{1-H_{ii}}\right)^2 \end{align}\]

To implement it,

# write a function loocv
# Leave-one-out CV via the linear-smoother shortcut derived above:
#   loocv_i = ((y_i - yhat_i) / (1 - H_ii))^2,
# so a single in-sample fit suffices -- no refitting per left-out point.
# input x y and bandwidth h
# output kernel smoothing result for y and the loocv error for each y
loocv <- function(x,y,h){
  yks=kr$knsmooth(x,x,y,h)
  # H[i, j] = normalized weight of observation i when predicting at x_j;
  # only its diagonal is needed.
  H= sapply(x, function(t) kr$weights(x,t,h))
  # FIX: compute diag(H) once and vectorize -- previously diag(H) was
  # recomputed inside sapply for every single index i.
  Hii <- diag(H)
  loocverr <- as.vector(((y - yks) / (1 - Hii))^2)
  return(list(yks=yks,loocverr=loocverr))
}

#use the loocv function on the smooth function of x2 with the h grid created
#before, and choose the h with the smallest mean loocv error
loocverr_x2y2s_h=sapply(h,function(t) loocv(x2,y2s,t)$loocverr)
# average the per-observation loocv errors within each column (one per h)
loocverr_x2y2s_h=colSums(loocverr_x2y2s_h)/dim(loocverr_x2y2s_h)[1]
hbest_2s_loocv=h[which.min(loocverr_x2y2s_h)]
hbest_2s_loocv
## [1] 0.03129293

Question 4 Local Polynomial

Part A

\[g_x(x_i,a)=a_0+\sum_{k=1}^D a_k(x-x_i)^k\] where \(a=\{a_0,a_1,,,,,a_D\}\) \[\hat{a}=argmin \sum_i^n w_i (y_i-g_x(x_i,a))^2\] Define matrix \(R\) and \(W\) \[ R=\left[\begin{array}{ccccc} 1 &(x_1-x)^{1} &(x_1-x)^{2} &... & (x_1-x)^{D} \\ 1 &(x_2-x)^{1} &(x_2-x)^{2} &... & (x_2-x)^{D} \\ ... &... &... &... &... \\ 1 &(x_n-x)^{1} &(x_n-x)^{2} &... & (x_n-x)^{D} \\ \end{array} \right] \]

\[ W=\left[\begin{array}{ccccc} w_1 &0 &0 &... & 0 \\ 0 &w2 &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & w_n \\ \end{array} \right] \] Then, \[a=argmin (y-Ra)^T W (y-Ra)\] Take the derivative w.r.t to \(a\) and set to \(0\) \[\frac{\partial(y-Ra)^T W (y-Ra) }{\partial a}=-2R^Twy+2R^TwRa=0\] gives us \[\hat{a}= (R^T W R)^{-1} R^T W y\]

And \(\hat{f(x)}=g(x,a)=a_0=e_1^T (R^TWR)^{-1}R^TWy\)

Part B

When \(p=1\), \[ R=\left[\begin{array}{cc} 1 &(x_1-x) \\ 1 &(x_2-x) \\ ... &... \\ 1 &(x_n-x) \\ \end{array} \right] \] \[ W=\left[\begin{array}{ccccc} K(\frac{x_1-x}{h}) &0 &0 &... & 0 \\ 0 &K(\frac{x_2-x}{h}) &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & K(\frac{x_n-x}{h} \\ \end{array} \right] \]

Then we have \[\begin{align} R^T W R &= \left[\begin{array}{ccccc} 1 &1 &1 &... &1 \\ (x_1-x) &(x_2-x) & &(x_3-x)... &(x_n-x) \end{array} \right] \\ & \quad \quad *\left[\begin{array}{ccccc} K(\frac{x_1-x}{h}) &0 &0 &... & 0 \\ 0 &K(\frac{x_2-x}{h}) &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & K(\frac{x_n-x}{h} \\ \end{array} \right] \\ & \quad \quad *\left[\begin{array}{cc} 1 &(x_1-x) \\ 1 &(x_2-x) \\ ... &... \\ 1 &(x_n-x) \\ \end{array} \right] \\ &= \left[\begin{array}{cc} \sum_i K(\frac{x_i-x}{h}) &\sum_i K(\frac{x_i-x}{h})(x_i-x) \\ \sum_i K(\frac{x_i-x}{h})(x_i-x) &\sum_iK(\frac{x_i-x}{h})(x_i-x)^2 \end{array} \right] \\ &= \left[\begin{array}{cc} \sum_iw_i &s_1 \\ s_1 &s_2 \end{array} \right] \end{align}\] \[\begin{align} R^T W y= &= \left[\begin{array}{ccccc} 1 &1 &1 &... &1 \\ (x_1-x) &(x_2-x) & &(x_3-x)... &(x_n-x) \end{array} \right] \\ & \quad \quad *\left[\begin{array}{ccccc} K(\frac{x_1-x}{h}) &0 &0 &... & 0 \\ 0 &K(\frac{x_2-x}{h}) &0 &... & 0 \\ ... &... &... &... &... \\ 0 &0 &0 &... & K(\frac{x_n-x}{h} \\ \end{array} \right] \\ & \quad \quad *\left[\begin{array}{c} y_1 \\ y_2 \\ ... \\ y_n \\ \end{array} \right] \\ &= \left[\begin{array}{c} \sum_i K(\frac{x_i-x}{h})y_i \\ \sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \end{array} \right] \end{align}\] Plug into \(\hat{a}\), \[\begin{align} \hat{a} &= \left[\begin{array}{cc} \sum_iw_i &s_1 \\ s_1 &s_2 \end{array} \right] ^{-1} \left[\begin{array}{c} \sum_i K(\frac{x_i-x}{h})y_i \\ \sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \end{array} \right] \\ &= \left[\begin{array}{c} \frac{s_2}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})y_i-\frac{s_1}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \\ -\frac{s_1}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})y_i+\frac{1}{s_2\sum_iw_i-s_1^2}\sum_i K(\frac{x_i-x}{h})(x_i-x)y_i \end{array} \right] \end{align}\]

\[\hat{f(x)}=a_0=a[1,]=\frac{\sum_i K(\frac{x_i-x}{h})(s_2-s_1(x_i-x))y_i}{(\sum_iw_i)s_2-s_1^2}\]

which means \(\hat{f(x)}=a_0=a[1,]=\frac{\sum_i w_i y_i}{s_2\sum_iw_i-s_1^2}\) with unnormalized weights \(w_i=K(\frac{x_i-x}{h})(s_2-s_1(x_i-x)\)

Part C

\[\hat{f(x)}=a_0=e_1^T (R^TWR)^{-1}R^TWy = \frac{\sum_i K(\frac{x_i-x}{h})(s_2-s_1(x_i-x))}{(\sum_iw_i)s_2-s_1^2} y_i \] call the new weights \(l_i\) \[\hat{f(x)}=\sum_il_iy_i=l^Ty\] \[E(\hat{f(x)})=\sum_il_if(x_i)\] \[var(\hat{f(x)})=\sigma^2 ||l||_2^2\] If we assume normality of the residuals \(\epsilon\), \[\hat{f(x)} \sim N(\sum_il_i f(x_i),\sigma^2 ||l||_2^2)\]

Part D

\[\begin{align} E(\hat{\sigma^2}) &= \frac{E((y-Hy)^T(y-Hy))}{n-2tr(H)+2tr(H^TH)} \\ E((y-Hy)^T(y-Hy))&= E(y^Ty-2y^TH^Ty=y^TH^THy)\\ & =\sum_i(\sigma^2+f(x_i)^2)-2tr(H\Sigma)-2f(x)^THf(x)+tr(H^TH\Sigma)+f(x)^TH^THf(x)\\ & =(n-2tr(H)+tr(H^TH))\sigma^2+(f(x)-Hf(x))^T(f(x)-Hf(x)) \end{align}\]

so \[ E(\hat{\sigma^2}) =\sigma^2 +\frac{(f(x)-Hf(x))^T(f(x)-Hf(x))}{n-2tr(H)+2tr(H^TH)}\]

If the smoother reproduces the true function closely, the bias term is small. Also, when \(n\) is large, the bias can be neglected.

Part E

# Fresh workspace for Part E; reload the data.
rm(list=ls())
library(modules)
set.seed(123)
# NOTE(review): hard-coded absolute path -- not portable; adjust to the
# local location of utilities.csv before running.
setwd("/Users/qiaohui.lin/Desktop/StatModeling2/")
data=read.csv("utilities.csv",header = TRUE,sep=',')
# response: average daily gas bill; predictor: temperature
y=data$gasbill/data$billingdays
x=data$temp
# design matrix with intercept (defined here but not used below)
x_reg= cbind(rep(1,length(x)),x)

#local linear regression module
llr <- module({
  import('stats')
  # Gaussian kernel at scaled distance d/h
  K <- function(d,h){
    return(dnorm(x=d/h,mean=0,sd=1))
  }
  # s2 = sum_i K((x_i - xi)/h) (x_i - xi)^2, xi being the evaluation point
  s2 <- function(x,xi,h){
    return(sum(K(xi-x,h)*(xi-x)^2))
  }
  # s1 = sum_i K((x_i - xi)/h) (x_i - xi)
  s1 <- function(x,xi,h){
    return(sum(K(xi-x,h)*(xi-x)))
  }
  # Normalized local-linear weights l_i proportional to K_i (s2 - d_i s1),
  # with d_i = xi - x_i (the Part B closed form).
  # FIX: evaluate the kernel vector once -- previously K() was re-evaluated
  # three times per call (directly, and again inside s2() and s1()).
  weights <- function(x,xi,h){
    d <- xi-x
    k <- K(d,h)
    w <- k * (sum(k*d^2) - d*sum(k*d))
    w <- w/sum(w)
    return(w)
  }
  # prediction at one evaluation point xi: weighted average of y
  llr_single <- function(y,x,xi,h){
    w <- weights(x,xi,h)
    y_pred <- t(w) %*% y
    return(y_pred)
  }
  # prediction at every point of t
  llr_all <- function(y,x,t,h){
    y_pred_all=sapply(t,function(i) llr_single(y,x,i,h))
    return(y_pred_all)
  }
})

#test with x[1] and h=1: a single prediction, then the full in-sample fit
y_pred_1=llr$llr_single(y,x,x[1],1)
y_pred_1
##          [,1]
## [1,] 3.969505
y_pred=llr$llr_all(y,x,x,1)
head(y_pred,10)
##  [1] 3.9695052 5.3426476 4.8696459 2.8793552 2.2915127 1.7903176 0.8970022
##  [8] 0.6122573 0.6122573 0.7516287
#candidate bandwidths for loocv selection
h=seq(0.5,10,length.out = 100)

#loocv for the local linear smoother, via the linear-smoother identity
#  err_i = ((y_i - yhat_i)/(1 - H_ii))^2, so only one fit is needed.
#input x y and bandwidth h; returns the in-sample predictions and the
#per-observation loocv errors.
llr_loocv <- function(y,x,h){
  ypred=llr$llr_all(y,x,x,h)
  # H[i, j] = weight of observation i when predicting at x_j
  H= sapply(x,function(t) llr$weights(x,t,h))
  # FIX: hoist diag(H) and vectorize -- previously diag(H) was recomputed
  # inside sapply for every index i.
  Hii <- diag(H)
  err <- as.vector(((y - ypred)/(1 - Hii))^2)
  return(list(ypred=ypred,err=err))
}

# choose the best h by applying loocv 
llr_loocv_err=sapply(h,function(t) llr_loocv(y,x,t)$err)
# total loocv error per candidate bandwidth (one column per h)
llr_loocv_err_h=colSums(llr_loocv_err)

hbest_llr_loocv=h[which.min(llr_loocv_err_h)]
hbest_llr_loocv
## [1] 6.833333
#plug in the best h 
y_pred=llr$llr_all(y,x,x,hbest_llr_loocv)
head(y_pred,10)
##  [1] 4.9222837 6.0184671 5.1589377 2.9884389 2.4797174 1.3284350 0.9724587
##  [8] 0.7261017 0.7261017 1.0871194

Part F

#plot prediction
plot(x,y_pred,col='red',pch=8,main='Local Linear Regression of y on x',xlab='x',ylab='y')
#create a new grid for x and its predictions for plotting the line
x_new=seq(5,80,by=0.1)
y_newpred=llr$llr_all(y,x,x_new,hbest_llr_loocv)
lines(x_new,y_newpred,col='red')
points(x,y,pch=20)
legend('topright',c('predicted','true'),lty=1,col=c('red','black'))

#plot residuals
plot(x,y-y_pred,main='Local Linear Regression Residuals',xlab='x',ylab='residual',ylim=c(-3,3))

#homoscedasticity does not hold: the residuals appear to shrink as x grows
#adjust the weights according to the variance

We are going to adjust the weights according to the variance we observe in the residuals with respect to \(x\). The new weights: \[w_i\propto K(x_i,x)\cdot\frac{1}{\sigma^2_i}\]

#adjust the weights according to the variance:
#inspect the (log squared) residuals with respect to x
plot(x,log((y-y_pred)^2),main='log of  Residual Square',xlab='x',ylab='log of residual^2')

# model the log squared residuals as a quadratic in x:
# log(sigma_i^2) ~ a + b*x_i + c*x_i^2
x_qm=cbind(x,x^2)
lm_guess=lm(log((y-y_pred)^2)~x_qm)
lm_guess
## 
## Call:
## lm(formula = log((y - y_pred)^2) ~ x_qm)
## 
## Coefficients:
## (Intercept)        x_qmx         x_qm  
##   -1.479810     0.044177    -0.001261
# extract the fitted coefficients of the variance model
# NOTE(review): assigning to `c` shadows base::c as a variable name; calls
# like c(...) below still resolve to the function, but renaming would be safer.
a=lm_guess$coefficients[1]
b=lm_guess$coefficients[2]
c=lm_guess$coefficients[3]
# variance model: sigma_i^2 = exp(a + b*x_i + c*x_i^2), with the fitted
# coefficients printed above (previous comment showed stale values)


# Local linear regression with heteroscedasticity-adjusted (precision) weights.
llr_hetero <- module({
  import('stats')
  # Gaussian kernel at scaled distance d/h
  K <- function(d,h){
    return(dnorm(x=d/h,mean=0,sd=1))
  }
  # s2 = sum_i K((x_i - xi)/h) (x_i - xi)^2 (kernel-only, no precision factor)
  s2 <- function(x,xi,h){
    return(sum(K(xi-x,h)*(xi-x)^2))
  }
  # s1 = sum_i K((x_i - xi)/h) (x_i - xi)
  s1 <- function(x,xi,h){
    return(sum(K(xi-x,h)*(xi-x)))
  }
  # Local-linear weights scaled by the precision 1/sigma_i^2, where
  # sigma_i^2 = exp(a + b*x_i + c*x_i^2) is the fitted variance model.
  # FIX: evaluate the kernel vector once (was re-evaluated in K, s2 and s1).
  weights_hetero <- function(x,xi,h,a,b,c){
    d <- xi-x
    k <- K(d,h)
    w <- k/(exp(a+b*x+c*x^2)) * (sum(k*d^2) - d*sum(k*d))
    w <- w/sum(w)
    return(w)
  }
  # prediction at one evaluation point xi
  llr_single_hetero <- function(y,x,xi,h,a,b,c){
    w <- weights_hetero(x,xi,h,a,b,c)
    y_pred <- t(w) %*% y
    return(y_pred)
  }
  # prediction at every point of t
  llr_all_hetero <- function(y,x,t,h,a,b,c){
    y_pred_all=sapply(t,function(i) llr_single_hetero(y,x,i,h,a,b,c))
    return(y_pred_all)
  }
})


# loocv for the heteroscedasticity-adjusted local linear smoother; same
# linear-smoother shortcut err_i = ((y_i - yhat_i)/(1 - H_ii))^2.
llr_hetero_loocv <- function(y,x,h,a,b,c){
  ypred=llr_hetero$llr_all_hetero(y,x,x,h,a,b,c)
  # H[i, j] = weight of observation i when predicting at x_j
  H= sapply(x,function(t) llr_hetero$weights_hetero(x,t,h,a,b,c))
  # FIX: hoist diag(H) and vectorize (was recomputed per element in sapply)
  Hii <- diag(H)
  err <- as.vector(((y - ypred)/(1 - Hii))^2)
  return(list(ypred=ypred,err=err))
}

# choose the best h by applying loocv 
llr_loocv_err_hetero=sapply(h,function(t) llr_hetero_loocv(y,x,t,a,b,c)$err)
# total loocv error per candidate bandwidth
llr_loocv_err_h_hetero=colSums(llr_loocv_err_hetero)

hbest_llr_loocv_hetero=h[which.min(llr_loocv_err_h_hetero)]
hbest_llr_loocv_hetero
## [1] 4.242424
#plug in the best h 
y_pred_hetero=llr_hetero$llr_all_hetero(y,x,x,hbest_llr_loocv_hetero,a,b,c)
head(y_pred_hetero,10)
##  [1] 4.8792355 5.9941894 5.0502596 2.8078812 2.2620922 1.1594569 0.8026374
##  [8] 0.7095204 0.7095204 0.8953889
#plot prediction
plot(x,y_pred_hetero,col='red',pch=8,main='Local Linear Regression,heteroscadesticity adjusted weight',xlab='x',ylab='y')
#create a new grid for x and its predictions for plotting the line
x_new=seq(5,80,by=0.1)
y_newpred_hetero=llr_hetero$llr_all_hetero(y,x,x_new,hbest_llr_loocv_hetero,a,b,c)
lines(x_new,y_newpred_hetero,col='red')
points(x,y,pch=20)
legend('topright',c('predicted','true'),lty=1,col=c('red','black'))

#plot residuals, then log squared residuals, to check the variance adjustment
plot(x,y-y_pred_hetero,main='Local Linear Regression Residuals,,heteroscadesticity adjusted weight',xlab='x',ylab='residual')

plot(x,log((y-y_pred_hetero)^2),main='Local Linear Regression Residuals,,heteroscadesticity adjusted weight',xlab='x',ylab='residual')

Part G

The CI for original fit

#estimate sigma^2 using the Part D estimator:
# sigma^2_hat = RSS / (n - 2 tr(H) + tr(H^T H))
n=length(y)
#calculate normalized weights l and their l2 norm
# NOTE(review): `ls` shadows base::ls() as a variable name here.
ls <- sapply(x, function(t) llr$weights(x,t,hbest_llr_loocv))
H=t(ls)
sig2 = sum((y-y_pred)^2)/(n-2*sum(diag(H))+sum(diag(t(H)%*%H)))
l2=sqrt(colSums(ls^2))
#var(f_hat(x)) = sigma^2 * ||l||_2^2 (Part C)
sig2f=l2^2*sig2

#gaussian 95% critical value 1.96
# (y_ub/y_lb at the observed x are computed but only the grid versions
# below are actually plotted)
y_ub=y_pred+1.96*sqrt(sig2f)
y_lb=y_pred-1.96*sqrt(sig2f)

#do the same thing for the new x grid we created, to plot the CI bands
ls_new=sapply(x_new, function(t) llr$weights(x,t,hbest_llr_loocv))
l2_new=sqrt(colSums(ls_new^2))
sig2f_new=l2_new^2*sig2
y_ub_new=y_newpred+1.96*sqrt(sig2f_new)
y_lb_new=y_newpred-1.96*sqrt(sig2f_new)

plot(x,y_pred,pch=8,col='red',main='95% CI for Predicted Value')
lines(x_new,y_newpred,col='red')
lines(x_new,y_ub_new,col='blue')
lines(x_new,y_lb_new,col='blue')
points(x,y,pch=20)

Gaussian Process

Part A

library(mvtnorm)
# input locations for the GP draws
N=1000
x <- rnorm(N,0,1)
# zero mean vector (note: unused below -- gp_es/gp_m52 build their own)
m <- rep(0,N)

#GP of 0 mean, squared-exponential covariance
# x:      input locations
# n:      number of sample paths to draw
# b:      length scale; tau1sq: signal variance; tau2sq: nugget
# returns an n x length(x) matrix of GP draws
gp_es <- function(x,n,b,tau1sq, tau2sq){
  m <- rep(0,length(x))
  # FIX: build the covariance with vectorized outer() instead of an O(N^2)
  # double for-loop of scalar assignments (same matrix, much faster).
  # The nugget is added wherever x values coincide, matching the original
  # (x[i] == x[j]) term (not only on the diagonal).
  D <- outer(x, x, "-")
  C <- tau1sq*exp(-0.5*(D/b)^2) + tau2sq*outer(x, x, "==")
  fx=rmvnorm(n,mean=m,sigma=C)
  return(fx)
}

# GP of 0 mean, Matern 5/2 covariance; same arguments/return as gp_es.
gp_m52 <- function(x,n,b,tau1sq, tau2sq){
  m <- rep(0,length(x))
  # FIX: vectorized outer() replaces the O(N^2) scalar double loop;
  # abs() replaces the elementwise sqrt((xi - xj)^2).
  D <- abs(outer(x, x, "-"))
  C <- tau1sq*(1+sqrt(5)*D/b+5*D^2/(3*b^2))*exp(-sqrt(5)*D/b)+tau2sq*outer(x, x, "==")
  fx=rmvnorm(n,mean=m,sigma=C)
  return(fx)
}

Varying \(b\) in \((0.01,0.1,0.5,1)\)

# three draws per panel, varying the length scale b
n=3
par(mfrow=c(4,1))
for (b in c(0.01,0.1,0.5,1)){
  gpes_fit=gp_es(x,n,b,0.5,10^(-6))
  # sort by x so the sample paths plot as curves
  plot(x[order(x)],gpes_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Exponential Square b=",b),type='l',ylim=c(-2,2))
  points(x[order(x)],gpes_fit[2,][order(x)],col='orange',type='l')
  points(x[order(x)],gpes_fit[3,][order(x)],col='blue',type='l')
}

# same experiment with the Matern 5/2 kernel, varying b
n=3
par(mfrow=c(4,1))
for (b in c(0.01,0.1,0.5,1)){
  gpm52_fit=gp_m52(x,n,b,0.5,10^(-6))
  plot(x[order(x)],gpm52_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Matern 52 b=",b),type='l',ylim=c(-2,2))
  points(x[order(x)],gpm52_fit[2,][order(x)],col='orange',type='l')
  points(x[order(x)],gpm52_fit[3,][order(x)],col='blue',type='l')
}

Varying \(\tau_1^2\) in \((0.01,0.1,0.5,1)\)

# vary the signal variance tau1sq with b fixed at 0.5
n=3
par(mfrow=c(4,1))
for (tau1sq in c(0.01,0.1,0.5,1)){
  gpes_fit=gp_es(x,n,0.5,tau1sq,10^(-6))
  plot(x[order(x)],gpes_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Exponential Square tau1sq=",tau1sq),type='l',ylim=c(-2,2))
  points(x[order(x)],gpes_fit[2,][order(x)],col='orange',type='l')
  points(x[order(x)],gpes_fit[3,][order(x)],col='blue',type='l')
}

# same experiment with the Matern 5/2 kernel, varying tau1sq
n=3
par(mfrow=c(4,1))
for (tau1sq in c(0.01,0.1,0.5,1)){
  gpm52_fit=gp_m52(x,n,0.5,tau1sq,10^(-6))
  plot(x[order(x)],gpm52_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Matern 52 tau1sq=",tau1sq),type='l',ylim=c(-2,2))
  points(x[order(x)],gpm52_fit[2,][order(x)],col='orange',type='l')
  points(x[order(x)],gpm52_fit[3,][order(x)],col='blue',type='l')
}

Varying \(\tau_2^2\) in \((10^{-6},0.01,0.1,0.5)\)

# vary the nugget tau2sq with b = 0.5 and tau1sq = 0.5
n=3
par(mfrow=c(4,1))
for (tau2sq in c(10^(-6),0.01,0.1,0.5)){
  gpes_fit=gp_es(x,n,0.5,0.5,tau2sq)
  # FIX: the title previously printed "tau1sq=" with the stale tau1sq value
  # left over from the preceding loop; label the quantity actually varied.
  plot(x[order(x)],gpes_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Exponential Square tau2sq=",tau2sq),type='l',ylim=c(-2,2))
  points(x[order(x)],gpes_fit[2,][order(x)],col='orange',type='l')
  points(x[order(x)],gpes_fit[3,][order(x)],col='blue',type='l')
}

# same experiment with the Matern 5/2 kernel, varying tau2sq
n=3
par(mfrow=c(4,1))
for (tau2sq in c(10^(-6),0.01,0.1,0.5)){
  gpm52_fit=gp_m52(x,n,0.5,0.5,tau2sq)
  plot(x[order(x)],gpm52_fit[1,][order(x)],col='red',xlab='x',ylab='fit',main=paste("Matern 52 tau2sq=",tau2sq),type='l',ylim=c(-2,2))
  points(x[order(x)],gpm52_fit[2,][order(x)],col='orange',type='l')
  points(x[order(x)],gpm52_fit[3,][order(x)],col='blue',type='l')
}

Part B

\[ \left( \begin{array}{c} f(x*) \\ f(x) \end{array} \right) \sim N\left( \left( \begin{array}{c} \mu^* \\ \mu \end{array} \right), \left( \begin{array}{cc} C(x^*,x^*) & C(x^*,x) \\ C(x,x*) & C(x,x) \end{array} \right) \right) \]

The \((i,j)\) entry of \(C(x,x)\) is applying covariance \(C\) function to \(x_i\), \(x_j\), i.e, \(C(x_i,x_j)\).\

For the ease of notation, Call \(f(x*)=f*\), \(f(x)=f\).\

The conditional of \(f^*\) given \(f\) is \(f^*\mid f \sim N(\mu',\Sigma')\) where \[\mu' =\mu^* +C(x^*,x)C(x,x)^{-1}(f-\mu)\] \[\Sigma'=C(x^*,x^*)-C(x^*,x)C(x,x)^{-1}C(x,x^*)\]

:\ We know \((f^*,f)\) is jointly normal, with density \[p(f^*,f) \propto exp\left(-\frac{1}{2}[(f^*-\mu^*)^T,(f-\mu)^T]\Sigma^{-1}\left[\begin{array}{cc} f^*-\mu^* \\ f-\mu \end{array}\right]\right)\] where \[\Sigma=\left( \begin{array}{cc} C(x^*,x^*) & C(x^*,x) \\ C(x,x*) & C(x,x) \end{array} \right)\] \[\Sigma^{-1}=\Omega=\left( \begin{array}{cc} \Omega_{11} & \Omega_{12} \\ \Omega_{21} & \Omega_{22} \end{array} \right) =\left[ \begin{array}{cc} (\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T)^{-1} &-(\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T)^{-1}\Sigma_{12}\Sigma_{22}^{-1}\\ -\Sigma_{22}^{-1}\Sigma_{12}^T(\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T)^{-1} &(\Sigma_{22}-\Sigma_{12}^T\Sigma_{11}^{-1}\Sigma_{12})^{-1} \end{array} \right] \]

working on a log scale, \[\log p(f^*,f) \propto -\frac{1}{2}[(f^*-\mu^*)^T,(f-\mu)^T] \left[\begin{array}{cccc} \Omega_{11} &\Omega_{12}\\ \Omega_{12}^T &\Omega_{22} \end{array}\right] \left[\begin{array}{cc} f^*-\mu^* \\ f-\mu \end{array}\right] \]

\[\log p(f^*,f) \propto -\frac{1}{2}\left[(f^*-f)^T\Omega_{11}(f^*-\mu^*)+2(f^*-\mu^*)^T\Omega_{12}(f-\mu)+(f-\mu)^T\Omega_{22}(f-\mu)\right]\]

We know the marginal distribution of \(f\) is \[p(f)=N(\mu,\Sigma_{22})\] Thus \[p(f^*|f)=\frac{p(f^*,f)}{p(f)}\] \[\begin{align} \log(p(f^*|f) \propto -\frac{1}{2}[&(f^*-\mu^*)^T\Omega_{11}(f^*-\mu^*)+2(f^*-\mu^*)^T\Omega_{12}(f-\mu)+(f-\mu)^T\Omega_{22}(f-\mu) \\ &-(f-\mu)^T\Sigma_{22}^{-1}(f-\mu) ]) \end{align}\]

Thus, \[f^*-\mu^*|f \sim N(\Omega_{11}^{-1}\Omega_{12}(f-\mu),\Omega_{11}^{-1})\] which means, \[f^*|f \sim N(\mu^*+\Omega_{11}^{-1}\Omega_{12}(f-\mu), \Omega_{11}^{-1})\]

Plug in the expression for \(\Omega\) and \(\Sigma\), \[\Omega_{11}^{-1}\Omega_{12}=\Sigma_{12}\Sigma_{22}^{-1}=C(x^*,x)C(x,x)^{-1}\] \[\Omega_{11}^{-1}=\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{12}^T=C(x^*,x^*)-C(x^*,x)C(x,x)^{-1}C(x,x^*)\]

Put together, \[f^*|f \sim N(\mu^*+C(x^*,x)C(x,x)^{-1}(f-\mu),C(x^*,x^*)-C(x^*,x)C(x,x)^{-1}C(x,x^*))\]

Part C

\[p(y|\theta) \sim N(R\theta,\Sigma)\] so we can write \[y=R\theta+\epsilon\] \[\epsilon\sim N(0,\Sigma)\] Then the joint of \(y\) and \(\theta\) can be written as \[ \left( \begin{array}{c} y \\ \theta \end{array} \right) = \left( \begin{array}{c} R \\ I \end{array} \right)\theta+ \left( \begin{array}{c} I \\ 0 \end{array} \right) \epsilon \] Using Exercise 1 Question 1 Part 5 where we proved \[x=Ax_1+Bx_2\] \[x_1\sim MVN(\mu_1,\Sigma_1),x_2\sim MVN(\mu_2,\Sigma_2)\] Then the MGF for \(Y\) is: \[\begin{align} M_x(t) &= M_{Ax_1+Bx_2}(t)=E(e^{t^T(Ax_1+Bx_2)})\\ &= E(e^{(A^Tt)^Tx_1}e^{(B^Tt)^Tx_2})\\ &= M_{x_1}(A^T) M_{x_2}(B^T)\\ &= e^{t^TA\mu_1+\frac{1}{2}t^TA\Sigma_1A^Tt+t^TB\mu_2+\frac{1}{2}t^TB\mu_2+\frac{1}{2}t^TB\Sigma_2B^Tt}\\ &=e^{t^T(A\mu_1+B\mu_2)+\frac{1}{2}t^T(A\Sigma_1A^T+B\Sigma_2B^T)t} \end{align}\]

Thus we have, \[X\sim MVN(A\mu_1+B\mu_2,A\Sigma_1A^T+B\Sigma_2B^T)\] In this case where \[\left( \begin{array}{c} y \\ \theta \end{array} \right)=A\theta+B\epsilon\] where \[A=\left( \begin{array}{c} R \\ I \end{array} \right), B=\left( \begin{array}{c} I \\ 0 \end{array} \right) \] Thus, \[\left( \begin{array}{c} y \\ \theta \end{array} \right) \sim MVN(Am,AVA^T+B\Sigma B^T)\]

Nonparametric Regression

Part A

Use \(f_1\),,\(f_n\) to denote \(f(x_1)\).. \(f(x_n)\) for the ease of notation, \[\begin{align} p(f_1,,,f_n|y_{1:n}) &\propto \prod_{i=1}^n f(y_{1:n}|f_1,,,,f_n)p(f_1,,,f_n) \\ &\propto (\prod_{i=1}^n exp(-\frac{1}{2}(y_i-f_i)^2)) * (exp(-\frac{1}{2} f^TCf)) \\ & = exp((y-f)^T\Sigma^{-1}(y-f)+f^TC^{-1}f) \\ & = exp(f^T(\Sigma^{-1}+C^{-1})f-2f^T\Sigma^{-1}y) \\ & = MVN((\Sigma^{-1}+C^{-1})^{-1}\Sigma^{-1}y,(\Sigma^{-1}+C^{-1})^{-1}) \end{align}\]

Part B

\(cov(y_p,y_q)=C(x_p,x_q)+\sigma^2\delta_{pq}\), thus, \[cov(y)=C(x,x)+\sigma^2 I.\] Then the joint distribution is \[ \left( \begin{array}{c} f(x^*) \\ y \end{array} \right) \sim N\left( \left( \begin{array}{c} 0 \\ 0 \end{array} \right), \left( \begin{array}{cc} C(x^*,x^*) & C(x^*,x) \\ C(x,x^*) & C(x,x)+\sigma^2I \end{array} \right) \right) \] As we already derived in Question 4 Part B, the conditional distribution is \[f^*|y \sim N(C(x^*,x)(C(x,x)+\sigma^2I)^{-1}y,C(x^*,x^*)-C(x^*,x)(C(x,x)+\sigma^2 I)^{-1}C(x,x^*))\] Thus, the posterior expectation can be written in the form of a linear smoother as \[E(f^*|y)=C(x^*,x)(C(x,x)+\sigma^2I)^{-1}y=\sum_i \alpha_i K(x_i,x^*)\] where \[\alpha=(C(x,x)+\sigma^2 I)^{-1}y\]

Part C

#read in data
rm(list=ls())
set.seed(123)
library(mvtnorm)
library(plot3D)
library(lattice)
# NOTE(review): hard-coded absolute path -- not portable.
setwd("/Users/qiaohui.lin/Desktop/StatModeling2/")
data=read.csv("utilities.csv",header = TRUE,sep=',')
y=data$gasbill/data$billingdays
x=data$temp
# jitter the temperatures -- presumably to break exact ties in x so the
# covariance matrix stays well-conditioned; TODO confirm intent
x=x+rnorm(length(x),0,1)
n=length(y)

# Squared-exponential covariance matrix with a nugget.
# x:      input locations
# b:      length scale; tau1sq: signal variance; tau2sq: nugget
# returns the length(x) x length(x) covariance matrix with entries
#   tau1sq * exp(-0.5 ((x_i - x_j)/b)^2) + tau2sq * 1{x_i == x_j}
Exp2Sigma <- function(x,b,tau1sq, tau2sq){
  # FIX: vectorized outer() replaces the O(n^2) scalar double for-loop
  # (identical matrix; the nugget applies wherever x values coincide,
  # matching the original (x[i] == x[j]) term).
  D <- outer(x, x, "-")
  C <- tau1sq*exp(-0.5*(D/b)^2) + tau2sq*outer(x, x, "==")
  return(C)
}


#initializing parameters: length scale, signal variance, small nugget
b=20
tau1sq=5
tau2sq=10^(-3)
C=Exp2Sigma(x,b,tau1sq,tau2sq)
# noise variance set to the sample variance of y; diagonal noise covariance
sigma2=var(y)
Sigma=sigma2*diag(n)
# diagonal, so its inverse is available in closed form
Sigma_inv=(1/sigma2)*diag(n)

#calculate posterior mean and variance and upper/lower bounds,
#using the Part A result: f | y ~ N((Sigma^-1 + C^-1)^-1 Sigma^-1 y,
#                                   (Sigma^-1 + C^-1)^-1)
post_mean=solve(Sigma_inv+solve(C))%*% Sigma_inv %*% y
post_var=solve(Sigma_inv+solve(C))
# pointwise 95% bands from the posterior marginal variances
post_lb= post_mean-1.96*sqrt(diag(post_var))
post_ub= post_mean+1.96*sqrt(diag(post_var))

#plot, sorted by x so the mean and bands draw as curves
plot(x[order(x)],post_mean[order(x)],col='red',ylim=c(0,8),type='l')
points(x[order(x)],post_lb[order(x)],type='l',col='blue')
points(x[order(x)],post_ub[order(x)],type='l',col='blue')
points(x[order(x)],y[order(x)],pch=20)